{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Chicago Crime Prediction\n",
    "\n",
    "The model forecasts how many crimes are expected to be reported the next day, based on how many were reported over the previous `n` days."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%%capture\n",
    "\n",
    "%pip install --upgrade pip\n",
    "%pip install --upgrade seaborn\n",
    "%pip install --upgrade numpy\n",
    "%pip install --upgrade pandas\n",
    "%pip install --upgrade \"tensorflow<2\"\n",
    "%pip install --upgrade scikit-learn\n",
    "%pip install --upgrade google-cloud-bigquery"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from math import sqrt\n",
    "\n",
    "import numpy as np\n",
    "\n",
    "import pandas as pd\n",
    "from pandas.plotting import register_matplotlib_converters\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "register_matplotlib_converters()\n",
    "\n",
    "import seaborn as sns\n",
    "\n",
    "from sklearn.preprocessing import RobustScaler\n",
    "from sklearn.metrics import mean_squared_error\n",
    "\n",
    "import tensorflow as tf\n",
    "from tensorflow import keras\n",
    "from tensorflow.keras.layers import LSTM, Dense, Dropout\n",
    "\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from google.cloud import bigquery\n",
    "\n",
    "sql = \"\"\"\n",
    "SELECT count(*) as count, TIMESTAMP_TRUNC(date, DAY) as day\n",
    "FROM `bigquery-public-data.chicago_crime.crime`\n",
    "GROUP BY day\n",
    "ORDER BY day\n",
    "\"\"\"\n",
    "\n",
    "client = bigquery.Client()\n",
    "df = client.query(sql).result().to_dataframe()\n",
    "\n",
    "df.index = df.day\n",
    "df = df[['count']]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Visualize data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.figure(figsize=(20, 6))\n",
    "with warnings.catch_warnings():\n",
    "    warnings.simplefilter(\"ignore\")\n",
    "    sns.lineplot(data=df).set_title('Daily Crime Reports')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Preprocess data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split dataset into sequences of previous values and current values\n",
    "# For example, given a dataset: [1, 2, 3, 4, 5] and a window size of 2:\n",
    "# data_X = [[1, 2], [2, 3], [3, 4]]\n",
    "# data_y = [3, 4, 5]\n",
    "def create_dataset(dataset, window_size = 1):\n",
    "    data_X, data_y = [], []\n",
    "    df = pd.DataFrame(dataset)\n",
    "    columns = [df.shift(i) for i in reversed(range(1, window_size+1))]\n",
    "    data_X = pd.concat(columns, axis=1).dropna().values\n",
    "    data_y = df.shift(-window_size).dropna().values\n",
    "    return data_X, data_y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The % of data we should use for training\n",
    "TRAINING_SPLIT = 0.8\n",
    "# The # of observations to use to predict the next observation\n",
    "WINDOW_SIZE = 7\n",
    "\n",
    "def preprocess_data(df, window_size):\n",
    "    # Normalize inputs to improve learning process\n",
    "    scaler = RobustScaler()\n",
    "\n",
    "    # Time series: split latest data into test set\n",
    "    train = df.values[:int(TRAINING_SPLIT * len(df)), :]\n",
    "    train = scaler.fit_transform(train)\n",
    "    test = df.values[int(TRAINING_SPLIT * len(df)):, :]\n",
    "    test = scaler.transform(test)\n",
    "\n",
    "    # Create test and training sets\n",
    "    train_X, train_y = create_dataset(train, window_size)\n",
    "    test_X, test_y = create_dataset(test, window_size)\n",
    "\n",
    "    # Reshape input data\n",
    "    train_X = np.reshape(train_X, (train_X.shape[0], 1, train_X.shape[1]))\n",
    "    test_X = np.reshape(test_X, (test_X.shape[0], 1, test_X.shape[1]))\n",
    "\n",
    "    return train_X, train_y, test_X, test_y, scaler\n",
    "\n",
    "train_X, train_y, test_X, test_y, scaler = preprocess_data(df, WINDOW_SIZE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def input_fn(features, labels, shuffle, num_epochs, batch_size):\n",
    "    \"\"\"Generates an input function to be used for model training.\n",
    "\n",
    "    Args:\n",
    "      features: numpy array of features used for training or inference\n",
    "      labels: numpy array of labels for each example\n",
    "      shuffle: boolean for whether to shuffle the data or not (set True for\n",
    "        training, False for evaluation)\n",
    "      num_epochs: number of epochs to provide the data for\n",
    "      batch_size: batch size for training\n",
    "\n",
    "    Returns:\n",
    "      A tf.data.Dataset that can provide data to the Keras model for training or\n",
    "        evaluation\n",
    "    \"\"\"\n",
    "\n",
    "    if labels is None:\n",
    "        inputs = features\n",
    "    else:\n",
    "        inputs = (features, labels)\n",
    "\n",
    "    dataset = tf.data.Dataset.from_tensor_slices(inputs)\n",
    "\n",
    "    if shuffle:\n",
    "        dataset = dataset.shuffle(buffer_size=len(features))\n",
    "\n",
    "    # We call repeat after shuffling, rather than before, to prevent separate\n",
    "    # epochs from blending together.\n",
    "    dataset = dataset.repeat(num_epochs)\n",
    "    dataset = dataset.batch(batch_size)\n",
    "    return dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_keras_model(input_dim, learning_rate, window_size):\n",
    "    \"\"\"Creates Keras model for regression.\n",
    "\n",
    "    Args:\n",
    "      input_dim: How many features the input has\n",
    "      learning_rate: Learning rate for training\n",
    "\n",
    "    Returns:\n",
    "      The compiled Keras model (still needs to be trained)\n",
    "    \"\"\"\n",
    "    \n",
    "    model = keras.Sequential([\n",
    "        LSTM(4, dropout = 0.2, input_shape = (input_dim, window_size)),\n",
    "        Dense(1)\n",
    "    ])\n",
    "\n",
    "    model.compile(loss='mean_squared_error', optimizer=tf.train.AdamOptimizer(\n",
    "        learning_rate=learning_rate))    \n",
    "    \n",
    "    return(model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def train_and_evaluate(batch_size, learning_rate, num_epochs, window_size):\n",
    "    # Dimensions\n",
    "    num_train_examples, input_dim, _ = train_X.shape\n",
    "    num_eval_examples = test_X.shape[0]\n",
    "\n",
    "    # Create the Keras Model\n",
    "    keras_model = create_keras_model(\n",
    "        input_dim=input_dim, learning_rate=learning_rate, window_size=window_size)\n",
    "\n",
    "    # Pass a numpy array by passing DataFrame.values\n",
    "    training_dataset = input_fn(\n",
    "        features=train_X,\n",
    "        labels=train_y,\n",
    "        shuffle=False,\n",
    "        num_epochs=num_epochs,\n",
    "        batch_size=batch_size)\n",
    "\n",
    "    # Pass a numpy array by passing DataFrame.values\n",
    "    validation_dataset = input_fn(\n",
    "        features=test_X,\n",
    "        labels=test_y,\n",
    "        shuffle=False,\n",
    "        num_epochs=num_epochs,\n",
    "        batch_size=num_eval_examples)\n",
    "\n",
    "    # Train model\n",
    "    keras_model.fit(\n",
    "        training_dataset,\n",
    "        steps_per_epoch=int(num_train_examples / batch_size),\n",
    "        epochs=num_epochs,\n",
    "        validation_data=validation_dataset,\n",
    "        validation_steps=1,\n",
    "        verbose=1,\n",
    "        shuffle=False,\n",
    "    )\n",
    "    \n",
    "    return keras_model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "BATCH_SIZE = 256\n",
    "LEARNING_RATE = 0.01\n",
    "NUM_EPOCHS = 25\n",
    "\n",
    "model = train_and_evaluate(BATCH_SIZE, LEARNING_RATE, NUM_EPOCHS, WINDOW_SIZE)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluate model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def predict(model, X, y, scaler):\n",
    "    y_true = scaler.inverse_transform(y)\n",
    "    y_pred = scaler.inverse_transform(model.predict(X))\n",
    "    rmse = sqrt(mean_squared_error(y_true, y_pred))\n",
    "    return y_pred, rmse\n",
    "\n",
    "train_predict, _ = predict(model, train_X, train_y, scaler)\n",
    "test_predict, rmse = predict(model, test_X, test_y, scaler)\n",
    "\n",
    "model.evaluate(train_X, train_y)\n",
    "print(rmse)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Plot predictions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create new dataframe with similar indexes and columns to store prediction array\n",
    "df_test_predict = pd.DataFrame().reindex_like(df)\n",
    "# Assign test predictions to end of dataframe\n",
    "df_test_predict['count'][len(train_predict) + (WINDOW_SIZE * 2):len(df)] = np.squeeze(test_predict)\n",
    "# Append the test predictions to the end of the existing dataframe, while renaming the column to avoid collision\n",
    "df_combined = df.join(df_test_predict.rename(index=str, columns={'count':'predicted'}))\n",
    "\n",
    "# Plot the predicted vs actual counts\n",
    "plt.figure(figsize=(20, 6))\n",
    "with warnings.catch_warnings():\n",
    "    warnings.simplefilter(\"ignore\")\n",
    "    sns.lineplot(data=df_combined).set_title('Daily Crime Reports')\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
